import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import numpy as np
from scipy import stats
import sys
sys.path.append(sys.argv[1])

import pandas as pd
import numpy as np
import pickle

# Read the match data from the directory passed as the first CLI argument
data_path = os.path.join(sys.argv[1], 'atp_tennis.csv')
atp_tennis = pd.read_csv(data_path)

# Parse 'Date' so it can be compared against a timestamp cutoff
atp_tennis['Date'] = pd.to_datetime(atp_tennis['Date'])

# The five-year window is anchored on the most recent match in the data,
# not on today's date
last_date = atp_tennis['Date'].max()
five_years_ago = last_date - pd.DateOffset(years=5)
recent_matches = atp_tennis.loc[atp_tennis['Date'] >= five_years_ago]

# Stack both sides of every match into one (Player_1, Rank_1) table so each
# player's rank is seen regardless of which column they were listed in
first_side = recent_matches[['Player_1', 'Rank_1']]
second_side = recent_matches[['Player_2', 'Rank_2']].rename(
    columns={'Player_2': 'Player_1', 'Rank_2': 'Rank_1'}
)
stacked_ranks = pd.concat([first_side, second_side])

# Keep each player's smallest Rank_1 value in the window, then take the ten
# players with the smallest values
best_rank_per_player = stacked_ranks.groupby('Player_1').min()
top_10_ranked_players = best_rank_per_player.sort_values('Rank_1').head(10)

# Flatten to a list of [player, rank] pairs
top_10_ranked_players_list = top_10_ranked_players.reset_index().values.tolist()

print(top_10_ranked_players_list)
# pickle.dump(top_10_ranked_players_list,open("./ref_result/top_10_ranked_players_list.pkl","wb"))

import pandas as pd
import numpy as np
import pickle


# Take the player name from each [player, rank] pair and drop any
# surrounding whitespace
top_10_player_names = [player[0].strip() for player in top_10_ranked_players_list]

# Grass court matches from the recent window in which at least one of the
# two players is in the top 10
grass_matches = recent_matches[
    (recent_matches['Surface'] == 'Grass')
    & (
        recent_matches['Player_1'].isin(top_10_player_names)
        | recent_matches['Player_2'].isin(top_10_player_names)
    )
]

# Matches played per player = appearances as Player_1 + appearances as Player_2.
# A plain `+` aligns on the index and yields NaN for anyone who appears in only
# one of the two columns, so add with fill_value=0 instead. reindex (rather than
# label indexing) keeps top-10 players with no grass matches as 0 instead of
# raising KeyError.
matches_played = (
    grass_matches['Player_1'].value_counts()
    .add(grass_matches['Player_2'].value_counts(), fill_value=0)
    .reindex(top_10_player_names, fill_value=0)
    .astype(int)
)

# Matches won per player; players without a win are reported as 0
matches_won = (
    grass_matches['Winner'].value_counts()
    .reindex(top_10_player_names, fill_value=0)
)

# Win rate in percent. 0 / 0 produces NaN for players with no matches; report
# 0 for them, matching the per-round win-rate convention used later in the file.
win_rate_percentage = (matches_won / matches_played * 100).fillna(0)

# Summary table, indexed by player name (the Series share that index)
win_rate_table = pd.DataFrame({
    'Player': top_10_player_names,
    'Matches Played': matches_played,
    'Matches Won': matches_won,
    'Win Rate Percentage': win_rate_percentage,
})
result = win_rate_table.reset_index(drop=True)

print(win_rate_table)
# pickle.dump(win_rate_table,open("./ref_result/win_rate_table.pkl","wb"))

import pandas as pd
import numpy as np
import pickle


# Grass court matches in which BOTH players are in the top 10
head_to_head_matches = grass_matches[
    grass_matches['Player_1'].isin(top_10_player_names)
    & grass_matches['Player_2'].isin(top_10_player_names)
]

# Square matrix of zeros: rows are winners, columns are the opponents they beat.
# Size it from the actual name list so it still works when fewer than ten
# players were found (a hard-coded 10x10 would raise a shape mismatch).
player_count = len(top_10_player_names)
head_to_head_matrix = pd.DataFrame(
    np.zeros((player_count, player_count)),
    columns=top_10_player_names,
    index=top_10_player_names,
)

# Count one win in cell [winner, loser] for every head-to-head match
for player_1, player_2, winner in zip(
    head_to_head_matches['Player_1'],
    head_to_head_matches['Player_2'],
    head_to_head_matches['Winner'],
):
    # The loser is whichever of the two listed players is not the winner
    loser = player_2 if player_1 == winner else player_1
    head_to_head_matrix.loc[winner, loser] += 1

print(head_to_head_matrix)
# pickle.dump(head_to_head_matrix,open("./ref_result/head_to_head_matrix.pkl","wb"))

import pandas as pd
import numpy as np
import pickle


rounds = ['Early Round', 'Quarterfinals', 'Semifinals', 'Final']

# 'Early Round' is a bucket covering these 'Round' values; every other entry
# in `rounds` is compared against the 'Round' column literally
early_round_labels = ['1st Round', '2nd Round', '3rd Round', '4th Round']

# One record per (player, round) combination
performance_data_list = []

for player in top_10_player_names:
    # Rows where this player appears on either side of the match
    involves_player = (
        (grass_matches['Player_1'] == player)
        | (grass_matches['Player_2'] == player)
    )

    for round_name in rounds:
        # Rows belonging to the current round bucket
        if round_name == 'Early Round':
            in_round = grass_matches['Round'].isin(early_round_labels)
        else:
            in_round = grass_matches['Round'] == round_name

        player_round_matches = grass_matches[in_round & involves_player]

        # Matches played and won by the player in this round bucket
        matches_played = len(player_round_matches)
        matches_won = int((player_round_matches['Winner'] == player).sum())

        # Win rate in percent; 0 when the player has no matches in this round
        if matches_played > 0:
            win_rate = (matches_won / matches_played) * 100
        else:
            win_rate = 0

        performance_data_list.append({
            'Player': player,
            'Round': round_name,
            'Matches Played': matches_played,
            'Matches Won': matches_won,
            'Win Rate': win_rate,
        })

# Assemble the collected records into a DataFrame
performance_data = pd.DataFrame(performance_data_list)

print(performance_data)
# pickle.dump(performance_data,open("./ref_result/performance_data.pkl","wb"))



import pandas as pd
import numpy as np
import pickle


# Every row of grass_matches involves at least one top 10 player, so a winner
# who is NOT in the top 10 list has beaten a top 10 player. Count those wins
# per opponent.
beaten_by_outsider = ~grass_matches['Winner'].isin(top_10_player_names)
opponents_won = grass_matches.loc[beaten_by_outsider, 'Winner'].value_counts()

# Keep only opponents with at least 3 such wins
notable_opponents = opponents_won[opponents_won >= 3]

# Flatten to a list of [opponent, wins] pairs
notable_opponents_list = notable_opponents.reset_index().values.tolist()

print(notable_opponents_list)

# Persist the result; the context manager guarantees the file handle is
# flushed and closed even if pickling fails part-way through
with open("./ref_result/notable_opponents_list.pkl", "wb") as output_file:
    pickle.dump(notable_opponents_list, output_file)